---
title: "Smoking and Multiple Sclerosis risk in Black people: a nested case-control study"
author: "Vinicius A. Schoeps, Marianna Cortese, Kassandra L. Munger, James D. Mancuso, David W. Niebuhr, Xiaojing Peng, Alberto Ascherio, Kjetil Bjornevik"
date: "`r Sys.Date()`"
output: 
  html_document:
    toc: TRUE
    toc_float: true
    number_sections: true
    code_folding: hide
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
knitr::opts_knit$set(root.dir = here::here())
```

```{r packages}
library(tidyverse)
library(tidylog, warn.conflicts = FALSE)
library(tableone)
library(survival)
library(broom)
library(RColorBrewer)
library(cowplot)
```

# Loading and appending datasets
```{r data management 1, results='hide'}
army56 <- read_csv("data/cotinine_mph_army56.csv")
army2 <- read_csv("data/cotinine_mph_army2.csv")
army562 <- army2 %>% bind_rows(army56)
```

# Data cleaning
```{r data management 2, results='hide'}
# Missing age 
age_miss <- which(army562 %>% select(ageonset) %>% is.na())
age_values <- c(35, 35)   # Not time-varying and available for the other
age_full <- replace(army562$ageonset, age_miss, age_values)

# Missing yrser_ons
yrser_miss <- which(army562 %>% select(yrser_ons) %>% is.na())
yrser_values <- c(3, 3)   # Calculated field
yrser_full <- replace(army562$yrser_ons, yrser_miss, yrser_values)

# Median imputation for missing cotinine
cot_miss <- which(army562 %>% select(cot_ngml) %>% is.na())
cot_values <- c(median(army2$cot_ngml, na.rm = TRUE), median(army2$cot_ngml, na.rm = TRUE))  
    # From army's 2 median levels - are equal for cases and controls
cot_full <- replace(army562$cot_ngml, cot_miss, cot_values)

army562_clean <- army562 %>% mutate(
                    ageonset = age_full,
                    yrser_ons = yrser_full,
                    cot_ngml = cot_full)

which(army562_clean %>% select(colnames(army562_clean)[1:12]) %>% is.na())
which(army562_clean %>% select(ebv_pos) %>% is.na())
    # All the working variables are complete, except EBV that is not going to be used in the analysis and therefore will be ignored

# Creating factors
army562_clean <- army562_clean %>% mutate(
                  race = factor(race),
                  timeser = factor(timeser),
                  casestat = factor(casestat),
                  casestat = fct_recode(casestat,
                                                "Controls" = "0",
                                                "Cases" = "1"),
                  female = factor(female),
                  ebv_pos = factor(ebv_pos)
)

# Drop "case only" pairs (IDs 76, 111, 172, 175) and non B or W races
army562_filter <- army562_clean %>% filter(
                    group_id != 76,
                    group_id != 111,
                    group_id != 172,
                    group_id != 175,
                    race != "H",
                    race != "O") %>% mutate(race = factor(race),
                                            race = fct_recode(race,
                                                        "Black" = "B",
                                                        "White" = "W")
) 

```

# Variable creation
```{r data management 3, results='hide'}
# Samples close to baseline (2yrs, 5yrs) + cotinine thresholds
army562_filter <- army562_filter %>% mutate(
          close_base_ser = case_when(yrser_ons<=1 & serumn==1 ~ 1,
                                     TRUE ~ 0),
          close_base_ser2 = case_when(yrser_ons<5 & serumn==1 ~ 1,
                                     TRUE ~ 0),
          cot_25 = case_when(cot_ngml>25 ~ 1,
                             TRUE ~ 0),
          cot_13 = case_when(cot_ngml>13 ~ 1,
                             TRUE ~ 0)
)

# Wide format
army562_filter_p <- army562_filter %>% pivot_wider(
                      names_from = timeser,
                      values_from = c(serumn, ageser, yrser_ons, cot_ngml, vit_d_raw, ebv_pos, close_base_ser, close_base_ser2, cot_25, cot_13)
)

# Smoke intensity variables
army562_filter_p <- army562_filter_p %>% mutate(
                      eversmoker25 = case_when(cot_25_index ==1 | cot_25_last ==1 ~ 1,
                                               cot_25_index ==0 & cot_25_last ==0 ~ 0,
                                               TRUE ~ NA_real_),
                      eversmoker13 = case_when(cot_13_index ==1 | cot_13_last ==1 ~ 1,
                                               cot_13_index ==0 & cot_13_last ==0 ~ 0,
                                             TRUE ~ 0),
                      smokeintensity = case_when(cot_25_index ==0 & cot_25_last ==0 ~ 0,
                                                 cot_25_index ==1 & cot_25_last ==1 ~ 2,
                                                 cot_25_index ==1 & cot_25_last ==0 ~ 1,
                                                 TRUE ~ NA_real_),
                      smokeintensity = factor(smokeintensity),
                      smokeintensity = fct_recode(smokeintensity,
                                                        "Never Smoker" = "0",
                                                        "Previous Smoker" = "1",
                                                        "Current Smoker" = "2"),
                      eversmoker25_baseline = case_when(cot_25_index ==1 ~ 1,
                                             TRUE ~ 0)
                      
)

army562_filter_p <- army562_filter_p %>% rowwise(id)
army562_filter_p <- army562_filter_p %>% mutate(
                      cot_average = mean(c(cot_ngml_index, cot_ngml_last), na.rm = TRUE))

army562_filter_p <- army562_filter_p %>% mutate(
                        averagesmoker = case_when(cot_average>25 ~ 1,
                                             TRUE ~ 0),
                        averagesmoker2 = case_when(cot_average<10 ~ 0,
                                                   cot_average>25 ~ 1,
                                                  TRUE ~ NA_real_)
)

# Log10 cotinine
army562_filter_p <- army562_filter_p %>% mutate(
                      log10_cot_ngml_lindex = log10(cot_ngml_index),
                      log10_cot_ngml_last = log10(cot_ngml_last),
                      log10_cot_average = log10(cot_average)
)

# Baseline vitamin D quartiles - based on black control distribution
army562_filter_p %>% ungroup() %>%  filter(casestat==0 & race=="Black") %>% summarize(
      value = quantile(vit_d_raw_index, probs = c(0.25, 0.5, 0.75)),
      percentile = c(0.25, 0.5, 0.75))

army562_filter_p <- army562_filter_p %>% mutate(
  vitd_quantile = case_when(
                      vit_d_raw_index <=36.9765 & race=="Black"~1,
                      vit_d_raw_index >36.9765 & vit_d_raw_index<=47.7333 & race=="Black"~2,
                      vit_d_raw_index >47.7333 & vit_d_raw_index<=63.0219	& race=="Black"~3,
                      vit_d_raw_index >63.0219 & race=="Black"~4,
                      TRUE ~ NA_real_))

table(army562_filter_p$vitd_quantile)
    # Note: there is a slight discrepancy between Stata and R's classification. 
      # Stata: 94, 81, 78, 89       R: 94, 80, 79, 89
      # Could not solve after changing the position of the = signs

```

# Table 1 
```{r table 1, results='hide'}
# Renaming variables for display
army562_filter_p_display <- army562_filter_p %>% rename("Sex"="female",
                                                        "First serum sample collection"="ageser_index",
                                                        "Second serum sample collection"="ageser_last",
                                                        "MS onset"="ageonset",
                                                        "Baseline vitamin D levels, nmol/mL"="vit_d_raw_index",
                                                        "Baseline EBV VCA IgG seropositivity"= "ebv_pos_index")                             %>% 
                            mutate(Sex = factor(Sex),
                                   Sex = fct_recode(Sex,"Male" = "0","Female" = "1"),
                                   Sex = fct_relevel(Sex, "Female", after=Inf),
                                   `Baseline EBV VCA IgG seropositivity` = factor(`Baseline EBV VCA IgG seropositivity`),
                                   `Baseline EBV VCA IgG seropositivity` = fct_recode(`Baseline EBV VCA IgG seropositivity`,"Absent" = "0","Present" = "1"),
                                   casestat = fct_relevel(casestat, "Controls", after=Inf))

table1 <- CreateTableOne(
  data = army562_filter_p_display,
  vars = c("Sex","First serum sample collection","Second serum sample collection", "MS onset", "Baseline vitamin D levels, nmol/mL", "Baseline EBV VCA IgG seropositivity"),
  strata = c("casestat", "race"),
  factorVars = c("Sex","Baseline EBV VCA IgG seropositivity"),
  test = FALSE)
table1 <- print(table1, catDigits=2, contDigits=2, nonnormal = c("First serum sample collection","Second serum sample collection","MS onset","Baseline vitamin D levels, nmol/mL"), missing=TRUE, showAllLevels = TRUE)
```
```{r}
kableone(table1)
```
*129 cases were matched on 1:1 basis whereas 28 cases on a 1:2

** Individuals with 2 pre-symptomatic samples – unavailable for 12 Black cases and 17 controls, 9 White cases and 18 controls
Abbreviations: EBV: Epstein-Barr Virus, IQR: Inter Quartile Range, NA: not applicable


# Figure 1
```{r figure 1, fig.keep="last"}
# Setting theme
theme_set(theme_bw() +
            #eliminates background, gridlines, and chart border
            theme(text = element_text(size=15),
                  plot.title = element_text(size = 12, face = "bold",hjust = 0.5),
                  plot.subtitle = element_text(size = 10,hjust = 0.5),
                  axis.line = element_line(colour = "black",size =0.5),
                  plot.background = element_blank(),
                  panel.grid.major = element_blank(),
                  panel.grid.minor = element_blank(),
                  axis.text.x = element_text(colour = "black"),
                  axis.text.y = element_text(colour = "black"),
                  panel.border =  element_blank()))

# Setting colors
mypalette <- brewer.pal(2, "Paired")

# Subset dataset
army562_fig1 <- army562_filter_p %>% select(id, casestat, race, cot_25_index, cot_25_last)

# Long format
army562_fig1 <- army562_fig1 %>% pivot_longer(
                      cols= c(cot_25_index, cot_25_last),
                      names_to = "time",
                      values_to = "smoking_stat"
)

# Setting variable names
army562_fig1 <- army562_fig1 %>% mutate(
                        time = fct_recode(time,
                                              "Baseline" = "cot_25_index",
                                              "Follow-up" = "cot_25_last",)
)

# Generate proportion of smoker per case status, race and time point
army562_fig1 <- army562_fig1 %>% 
                group_by(race, time, casestat, smoking_stat) %>% 
                summarize (n = n()) %>% na.omit() %>%  
                mutate(prop = n/sum(n)) %>% 
                filter(smoking_stat==1)

army562_fig1 <- army562_fig1 %>% select(race, time, casestat, prop)

army562_fig1_black <- army562_fig1 %>% filter(race=="Black")
army562_fig1_white <- army562_fig1 %>% filter(race=="White")

# Bar graph - Blacks
g1a <- ggplot(data = army562_fig1_black) +
    geom_col(aes(x = time, y = prop*100,
                   fill = casestat), position="dodge") +
  theme(axis.text.x = element_text(angle = 45, 
                                   vjust = 1, 
                                   hjust = 1)) +
  scale_y_continuous(limits = c(0, 60), 
                     breaks = c(0, 10, 20, 30, 40, 50, 60)) +
  scale_fill_discrete(name = "", labels = c("Controls", "MS")) +
  scale_fill_manual(name= "", values = mypalette) +
  labs(x = "",
       y = "",
       title = "",
       subtitle = "")

legend <- get_legend(g1a + 
                     guides(color = guide_legend(nrow = 1)))

g1a <- ggplot(data = army562_fig1_black) +
    geom_col(aes(x = time, y = prop*100,
                   fill = casestat), position="dodge") +
  theme(axis.text.x = element_text(angle = 45, 
                                   vjust = 1, 
                                   hjust = 1)) +
  scale_y_continuous(limits = c(0, 60), 
                     breaks = c(0, 10, 20, 30, 40, 50, 60),
                     labels = scales::label_percent(scale=1)) +
  scale_fill_discrete(name = "", labels = c("Controls", "MS")) +
  theme(legend.position = "none") +
  scale_fill_manual(name= "", values = mypalette) +
  labs(x = "",
       y = "",
       title = "",
       subtitle = "")

# Bar graph - Whites
g1b <- ggplot(data = army562_fig1_white) +
    geom_col(aes(x = time, y = prop*100,
                   fill = casestat), position="dodge") +
  theme(axis.text.x = element_text(angle = 45, 
                                   vjust = 1, 
                                   hjust = 1)) +
  scale_y_continuous(limits = c(0, 60), 
                     breaks = c(0, 10, 20, 30, 40, 50, 60),
                     labels = scales::label_percent(scale=1)) +
  scale_fill_discrete(name = "", labels = c("Controls", "MS")) +
  theme(legend.position = "none") +
  scale_fill_manual(name= "", values = mypalette) +
  labs(x = "",
       y = "",
       title = "",
       subtitle = "")

# Combine plots
g1 <- plot_grid(g1a, g1b,
          labels = c("A", "B"))

g1 <- plot_grid(g1, legend,ncol=2,rel_widths = c(1, .2))

# Exporting
ggsave (filename = "figures/Figure1.pdf",
        plot = g1,
        dpi = 300)

g1
``` 


**Figure 1**: proportion of smokers in cases and controls using a 25ng/mL cotinine threshold. **A**. Black people, **B**. White people 

# Association between smoking and MS diagnosis
## Black individuals
```{r table 2a}
# Setting Wald's 95% CI function
ci_func <- function(estimate, lci, uci) {
  OR <- round(exp(estimate), 2)
  lci <- round(exp(lci), 2)
  uci <- round(exp(uci), 2)
  to_print <- str_glue("{OR} (95% CI {lci}, {uci})")
  return(to_print)
}

# Subsetting
army562_filter_p <- army562_filter_p %>% ungroup()
army562_blacks <- army562_filter_p %>% filter(race=="Black")

# Model 1
## 1. Ever smoker adjusted by matching variables only
results_1 <- clogit(as.numeric(casestat) ~ eversmoker25 + strata(group_id), data=army562_blacks)   %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "1. Ever smoker ")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_blacks %>% filter(casestat=="Cases") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_blacks %>% filter(casestat=="Controls") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)

## 1a. Further adjusted by raw vitamin D (cont.)
results_1a <- clogit(as.numeric(casestat) ~ eversmoker25 + vit_d_raw_index + strata(group_id), data=army562_blacks)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "1a. Further adjusted by raw vitamin D (cont.)")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_blacks %>% filter(casestat=="Cases") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_blacks %>% filter(casestat=="Controls") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value) %>% 
  head(1)

## 1b. Further adjusted by raw vitamin D (quartiles)
results_1b <- clogit(as.numeric(casestat) ~ eversmoker25 + factor(vitd_quantile) + strata(group_id), data=army562_blacks)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "1b. Further adjusted by raw vitamin D (quartiles)")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_blacks %>% filter(casestat=="Cases") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_blacks %>% filter(casestat=="Controls") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value) %>% 
  head(1)

## 1c. Lower cotinine threshold (13ng/mL)
results_1c <- clogit(as.numeric(casestat) ~ eversmoker13 + strata(group_id), data=army562_blacks)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "1c. Lower cotinine threshold (13ng/mL)")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_blacks %>% filter(casestat=="Cases") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_blacks %>% filter(casestat=="Controls") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)


## Merging model 1 results
model1_blacks <- results_1 %>% bind_rows(results_1a) %>% bind_rows(results_1b) %>% bind_rows(results_1c)
knitr::kable(model1_blacks, caption = "Model 1: Association between presymptomatic smoking status and MS diagnosis (Black individuals)")


# Model 2: Previous smoker vs Never smoker and Current smoker vs Never smoker
results_2 <- clogit(as.numeric(casestat) ~ smokeintensity + strata(group_id), data=army562_blacks)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = c("2. Previous smoker vs Never smoker", "2. Current smoker vs Never smoker"))  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_blacks %>% filter(casestat=="Cases" & smokeintensity!="NA") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_blacks %>% filter(casestat=="Controls" & smokeintensity!="NA") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)


# Model 3: Average serum cotinine (>25ng/mL vs ≤25ng/mL) adjusted by matching variables only
results_3 <- clogit(as.numeric(casestat) ~ averagesmoker + strata(group_id), data=army562_blacks)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "3. Average serum cotinine (>25ng/mL vs ≤25ng/mL) ")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_blacks %>% filter(casestat=="Cases" & averagesmoker!="NA") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_blacks %>% filter(casestat=="Controls" & averagesmoker!="NA") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)


# Model 4: Average serum cotinine (>25ng/mL vs <10ng/mL) adjusted by matching variables only
results_4 <- clogit(as.numeric(casestat) ~ averagesmoker2 + strata(group_id), data=army562_blacks)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "4. Average serum cotinine (>25ng/mL vs <10ng/mL) ")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_blacks %>% filter(casestat=="Cases" & averagesmoker2!="NA") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_blacks %>% filter(casestat=="Controls" & averagesmoker2!="NA") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)


# Model 5: Baseline smoker
results_5 <- clogit(as.numeric(casestat) ~ eversmoker25_baseline + strata(group_id), data=army562_blacks)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "5. Baseline smoker")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_blacks %>% filter(casestat=="Cases") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_blacks %>% filter(casestat=="Controls") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)

## 5a. Only baseline samples collected ≥ 2 yrs before MS Diagnosis
army562_blacks_2y <- army562_blacks %>% filter(close_base_ser_index!=1)

results_5a <- clogit(as.numeric(casestat) ~ eversmoker25_baseline + strata(group_id), data=army562_blacks_2y)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "5a. Only baseline samples collected ≥ 2 yrs before MS Diagnosis")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_blacks_2y %>% filter(casestat=="Cases") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_blacks_2y %>% filter(casestat=="Controls") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)

## 5b. Only baseline samples collected ≥ 5 yrs before MS Diagnosis
army562_blacks_5y <- army562_blacks %>% filter(close_base_ser2_index!=1)

results_5b <- clogit(as.numeric(casestat) ~ eversmoker25_baseline + strata(group_id), data=army562_blacks_5y)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "5b. Only baseline samples collected ≥ 5 yrs before MS Diagnosis")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_blacks_5y %>% filter(casestat=="Cases") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_blacks_5y %>% filter(casestat=="Controls") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)

## Table 2a: model 1-5 in Blacks
table2a <- results_1 %>% bind_rows(results_2) %>% bind_rows(results_3) %>% bind_rows(results_4) %>% bind_rows(results_5) %>% bind_rows(results_5a) %>% bind_rows(results_5b)
knitr::kable(table2a, caption = "Table 2a: Association between presymptomatic smoking status and MS diagnosis (Black individuals)")

```


## White individuals
```{r table 2b}
# Subsetting
army562_whites <- army562_filter_p %>% filter(race=="White")

# Model 1
## 1. Ever smoker adjusted by matching variables only
results_1 <- clogit(as.numeric(casestat) ~ eversmoker25 + strata(group_id), data=army562_whites)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "1. Ever smoker ")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_whites %>% filter(casestat=="Cases") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_whites %>% filter(casestat=="Controls") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)

## 1a. Further adjusted by raw vitamin D (cont.)
results_1a <- clogit(as.numeric(casestat) ~ eversmoker25 + vit_d_raw_index + strata(group_id), data=army562_whites)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "1a. Further adjusted by raw vitamin D (cont.)")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_whites %>% filter(casestat=="Cases") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_whites %>% filter(casestat=="Controls") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value) %>% 
  head(1)

## 1b. Further adjusted by raw vitamin D (quartiles)
### Purposefully omitted

## 1c. Lower cotinine threshold (13ng/mL)
results_1c <- clogit(as.numeric(casestat) ~ eversmoker13 + strata(group_id), data=army562_whites)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "1c. Lower cotinine threshold (13ng/mL)")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_whites %>% filter(casestat=="Cases") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_whites %>% filter(casestat=="Controls") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)


## Merging model 1 results
model1_whites <- results_1 %>% bind_rows(results_1a) %>% bind_rows(results_1c)
knitr::kable(model1_whites, caption = "Model 1: Association between presymptomatic smoking status and MS diagnosis (White individuals)")


# Model 2: Previous smoker vs Never smoker and Current smoker vs Never smoker
results_2 <- clogit(as.numeric(casestat) ~ smokeintensity + strata(group_id), data=army562_whites)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = c("2. Previous smoker vs Never smoker", "2. Current smoker vs Never smoker"))  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_whites %>% filter(casestat=="Cases" & smokeintensity!="NA") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_whites %>% filter(casestat=="Controls" & smokeintensity!="NA") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)


# Model 3: Average serum cotinine (>25ng/mL vs ≤25ng/mL) adjusted by matching variables only
results_3 <- clogit(as.numeric(casestat) ~ averagesmoker + strata(group_id), data=army562_whites)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "3. Average serum cotinine (>25ng/mL vs ≤25ng/mL) ")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_whites %>% filter(casestat=="Cases" & averagesmoker!="NA") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_whites %>% filter(casestat=="Controls" & averagesmoker!="NA") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)


# Model 4: Average serum cotinine (>25ng/mL vs <10ng/mL) adjusted by matching variables only
results_4 <- clogit(as.numeric(casestat) ~ averagesmoker2 + strata(group_id), data=army562_whites)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "4. Average serum cotinine (>25ng/mL vs <10ng/mL) ")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_whites %>% filter(casestat=="Cases" & averagesmoker2!="NA") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_whites %>% filter(casestat=="Controls" & averagesmoker2!="NA") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)


# Model 5: Baseline smoker
results_5 <- clogit(as.numeric(casestat) ~ eversmoker25_baseline + strata(group_id), data=army562_whites)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "5. Baseline smoker")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_whites %>% filter(casestat=="Cases") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_whites %>% filter(casestat=="Controls") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)

## 5a. Only baseline samples collected ≥ 2 yrs before MS Diagnosis
army562_whites_2y <- army562_whites %>% filter(close_base_ser_index!=1)

results_5a <- clogit(as.numeric(casestat) ~ eversmoker25_baseline + strata(group_id), data=army562_whites_2y)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "5a. Only baseline samples collected ≥ 2 yrs before MS Diagnosis")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_whites_2y %>% filter(casestat=="Cases") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_whites_2y %>% filter(casestat=="Controls") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)

## 5b. Only baseline samples collected ≥ 5 yrs before MS Diagnosis
army562_whites_5y <- army562_whites %>% filter(close_base_ser2_index!=1)

results_5b <- clogit(as.numeric(casestat) ~ eversmoker25_baseline + strata(group_id), data=army562_whites_5y)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "5b. Only baseline samples collected ≥ 5 yrs before MS Diagnosis")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  mutate(Cases = army562_whites_5y %>% filter(casestat=="Cases") %>% summarize(n()) %>% as.numeric(),
         Controls = army562_whites_5y %>% filter(casestat=="Controls") %>% summarize(n()) %>% as.numeric()) %>% 
  select(Model, Cases, Controls, OR, p.value)


## Table 2b: model 1-5 in Whites
table2b <- results_1 %>% bind_rows(results_2) %>% bind_rows(results_3) %>% bind_rows(results_4) %>% bind_rows(results_5) %>% bind_rows(results_5a) %>% bind_rows(results_5b)
knitr::kable(table2b, caption = "Table 2b: Association between presymptomatic smoking status and MS diagnosis (White individuals)")

```
Abbreviations: CI: Confidence Interval, RR: Rate Ratio, NA: not applicable.

Individuals with cotinine levels above 25 ng/mL in any pre-symptomatic samples were defined as ever-smokers. Additional levels according to the time points in which samples that have reached the 25ng/mL threshold: never smokers (no samples), previous smokers (earliest sample only), and current smokers (earliest and latest samples). Baseline smokers have reached the 25ng/mL threshold at their earliest sample, but no conditions were applied for follow-up samples.

Models were adjusted by matching variables (age, sex, race and ethnicity, dates of sample collection, and branch of military service) by design.

# Interactions
```{r interactions, results='hide'}
army562_filter_p2 <- army562_filter_p %>% mutate(
                      casestat = as.numeric(casestat),
                      casestat = case_when(casestat ==1 ~ 0,
                                           casestat ==2 ~ 1,
                                           TRUE ~ NA_real_),
                      race = fct_relevel(race,"Black", after=Inf))
                        # White as the reference category

# Multiplicative
## Race and smoking
int_r_s <- glm(casestat ~ eversmoker25*race + eversmoker25 + race + ageonset + female + ageser_index, family = binomial(link="logit"), data=army562_filter_p2)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "Race and smoking")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  select(Model, OR, p.value) %>% slice(3)

## Sex and smoking
int_s_s <- glm(casestat ~ eversmoker25*female + eversmoker25 + female + ageonset + race + ageser_index, family = binomial(link="logit"), data=army562_filter_p2)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "Sex and smoking")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  select(Model, OR, p.value) %>% slice(3)

### Sex and smoking in Blacks only
army562_filter_p2_b <- army562_filter_p2 %>% filter(race=="Black")

int_s_sb <- glm(casestat ~ eversmoker25*female + eversmoker25 + female + ageonset + ageser_index, family = binomial(link="logit"), data=army562_filter_p2_b)    %>% tidy() %>% 
  filter(term != "(Intercept)") %>% 
  mutate(Model = "Sex and smoking in blacks only")  %>%
  mutate(lci = estimate - 1.96*std.error,
         uci = estimate + 1.96*std.error,
         OR = ci_func(estimate, lci, uci),
         p.value = scales::pvalue(p.value)) %>%
  select(Model, OR, p.value) %>% slice(3)

## Table 4: multiplicative interactions
table4 <- int_r_s %>% bind_rows(int_s_s) %>% bind_rows(int_s_sb) 


# Additive
  ## Mathur MB & VanderWeele TJ (2018). R function for additive interaction measures. Epidemiology 29(1), e5-e6.
source("Additive Interaction/Additive interactions function.R")

## Race and smoking
int_r_s_add <- glm(casestat ~ eversmoker25*race + eversmoker25 + race + ageonset + female + ageser_index, family = binomial(link="logit"), data=army562_filter_p2) 

ci_func2 <- function(estimate, lci, uci) {
  Estimate <- round(estimate, 2)
  lci <- round(lci, 2)
  uci <- round(uci, 2)
  to_print <- str_glue("{Estimate} (95% CI {lci}, {uci})")
  return(to_print)
}

int_r_s_add <- additive_interactions(int_r_s_add, dat=NULL, CI.level=0.95, recode=FALSE) %>% 
                mutate(` ` = Stat,
                       Estimate = ci_func2(Est,CI.lo,CI.hi),
                       p.value = scales::pvalue(p.val.0)) %>% 
                slice(c(1,2)) %>% 
                select(` `, "Estimate","p.value")
```
```{r interaction tables,}
knitr::kable(table4, caption = "Table 4: Multiplicative Interactions")
knitr::kable(int_r_s_add, caption = "Table 5: Additive Interaction (Race and Smoking)")
```
Note: White race is the reference category for tables 4 and 5

# Session Info
```{r session infor, echo=FALSE}
sessionInfo()
```

